** Load the preprocessed data (ihwap_full.csv) used as input for the ML feature set.
* case(preserve) keeps the capitalization of the header names; varnames(1) reads
* the variable names from the first row of the file.
clear
import delimited using "$maindir/Data/ihwap_full.csv", case(preserve) varnames(1)

* Keep only observations whose square footage is not zero. Observations with a
* missing sqfeet are retained, exactly as with -drop if sqfeet==0-.
keep if sqfeet != 0


************ transform/interact some variables to be included in ML algorithms

*** polynomials
* Squares and cubes of the listed covariates.
* The new variables are stored as double: -generate- defaults to float, which
* holds only about 7 significant digits and therefore rounds large powers
* (for example the cube of sqfeet or Real_income) before they are exported.
local polyvars HDD60 HDD65 CDD75 sqfeet noccupants nwindows nstories ///
	Blower_Pre Real_income nbedrooms tmin tmax precip MainHeatBTU_wins elec_prices gas_prices

* All squares are created before any cube so that the column order of the
* dataset (and of the exported csv) stays the same as before.
foreach v of varlist `polyvars' {
	generate double `v'_sq = `v'*`v'
}

foreach v of varlist `polyvars' {
	generate double `v'_cube = `v'*`v'*`v'
}


*** logs - variables that can take negative values are left untransformed
* Natural log of the variables below, taken as-is.
local plainlogs sqfeet noccupants nwindows nstories ///
	Blower_Pre nbedrooms elec_prices gas_prices
foreach v of varlist `plainlogs' {
	generate `v'_log = ln(`v')
}

* For these three variables 0.1 is added before taking the natural log
* (presumably so that zero values do not turn into missing - confirm with the data).
local shiftedlogs Real_income precip MainHeatBTU_wins
foreach v of varlist `shiftedlogs' {
	generate `v'_log = ln(`v' + 0.1)
}

*** standardizing
* egen's std() creates a copy of each variable rescaled to mean 0 and
* standard deviation 1; the result is saved as <name>_std.
local stdvars HDD60 HDD65 CDD75 sqfeet noccupants nwindows nstories ///
	Blower_Pre Real_income nbedrooms tmin tmax precip MainHeatBTU_wins elec_prices gas_prices
foreach v of varlist `stdvars' {
	egen `v'_std = std(`v')
}

*** some interactions
* Each weather variable (tmin, tmax, precip) is multiplied by each of the
* variables in `crossvars'. The products are named <weather>X<variable>,
* e.g. tminXsqfeet. The weather variable is the outer loop so that all tminX*
* columns are created first, then all tmaxX*, then all precipX*.
local crossvars sqfeet noccupants nwindows nstories ///
	Blower_Pre Real_income nbedrooms MainHeatBTU_wins elec_prices gas_prices
foreach w of varlist tmin tmax precip {
	foreach h of varlist `crossvars' {
		generate `w'X`h' = `w'*`h'
	}
}

*** some bins
* xtile with 10 quantiles assigns each observation to a decile category of the
* variable; the category is saved as <name>_decile.
local binvars sqfeet noccupants nwindows nstories ///
	Blower_Pre nbedrooms
foreach v of varlist `binvars' {
	xtile `v'_decile = `v', nquantiles(10)
}

*** factorizing (one-hot encoding)
* tabulate's generate() option creates one 0/1 indicator per distinct value of
* the variable, named tab_<name>1, tab_<name>2, ...
local factorvars ShieldingClass builddate nstories HeatTypeMain ///
	priority nbedrooms end_month end_year ProgramYear
foreach v of varlist `factorvars' {
	tabulate `v', generate(tab_`v')
}
* note: CountyID is not included as a factor because of memory issues during the training stage


*** export the dataset, including every variable created above, to csv;
*** the file is meant to be used with ML algorithms in R
* replace: overwrite ihwap_full_flex.csv if it already exists
export delimited using "$maindir/Data/ihwap_full_flex.csv", replace
* remove the dataset from memory now that it has been written to disk
clear
